2016 Olympics medal count acquisition

In this notebook, we acquire the current medal count from the web.

1. List of sports


In [ ]:
from bs4 import BeautifulSoup
import urllib
r = urllib.urlopen('http://www.bbc.com/sport/olympics/rio-2016/medals/sports').read()
soup = BeautifulSoup(r,"lxml")

sports_span = soup.findAll("span",{"class","medals-table-by-sport__sport-name"})
sports_names = []
sports_names_format = []
for s in sports_span:
    sports_names_format.append(str(s))
    sports_names.append(str(s).lower().replace(" ","-")[48:-7])
print sports_names


['archery', 'athletics', 'badminton', 'basketball', 'boxing', 'canoeing', 'cycling', 'diving', 'equestrian', 'fencing', 'football', 'golf', 'gymnastics', 'handball', 'hockey', 'judo', 'modern-pentathlon', 'rowing', 'rugby-sevens', 'sailing', 'shooting', 'swimming', 'synchronised-swimming', 'table-tennis', 'taekwondo', 'tennis', 'triathlon', 'volleyball', 'water-polo', 'weightlifting', 'wrestling']

2. HTMLs for each sport's medal table


In [ ]:
# Save html for each sport
htmls = {}
for s in sports_names:
    htmls[s] = urllib.urlopen('http://www.bbc.com/sport/olympics/rio-2016/medals/sports/'+s+'#'+s).read()

In [ ]:
# Find table html for each sport
thtmls = {}
for s in sports_names:
    soupsp = BeautifulSoup(htmls[s],"lxml")
    thtmls[s] = soupsp.findAll("table",{"class","medals-table-by-sport__countries_table"})

3. Scrape medals for each country and sport


In [ ]:
# For every sport, scrape medal data
import re
medal_names = ['gold','silver','bronze']
medals = {}
sports_countries = {}
all_countries_format = []
for s in sports_names:
    print s
    medals[s] = {}
    h = str(thtmls[s])
    if not thtmls[s]:
        print 'no medals yet'
    else:
        # Find countries of interest
        pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\""
        pmatch = re.finditer(pattern, h)
        countries = []
        for i,match in enumerate(pmatch):
            country = h[int(match.end()):int(match.end())+200].rsplit('"')[0]
            all_countries_format.append(country)
            countries.append(country.lower().replace(" ","-"))
        sports_countries[s] = countries
        for c in sports_countries[s]:
            if c == 'great-britain-&amp;-n.-ireland':
                ci1 = 'great-britain-and-northern-ireland'
                medals[s][c] = {}
                for m in medal_names:
                    pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\".{,800}" + m + ".{,150}" + ci1 + "\">"
                    gendermatch = re.finditer(pattern, h)
                    for i,match in enumerate(gendermatch):
                        medals[s][c][m] = int(h[int(match.end()):int(match.end())+3])
            else:
                ci = c
                medals[s][ci] = {}
                for m in medal_names:
                    pattern = r"<abbr class=\"abbr-on medium-abbr-off\" title=\".{,500}" + m + ".{,150}" + ci + "\">"
                    gendermatch = re.finditer(pattern, h)
                    for i,match in enumerate(gendermatch):
                        medals[s][ci][m] = int(h[int(match.end()):int(match.end())+3])
        print medals[s]

Create dataframe of medals


In [ ]:
import numpy as np
all_countries_format = list(np.unique(all_countries_format))
all_countries_format.remove('Great Britain &amp; N. Ireland')
all_countries_format.append('Great Britain')
all_countries_format_list = list(np.unique(all_countries_format))

In [ ]:
import pandas as pd

# Create an empty dataframe
columns = ['country','sport','medal','N']
df = pd.DataFrame(columns=columns)

# Identify all countries with at least 1 medal
countries_list = list(set(reduce(lambda x,y: x+y,sports_countries.values())))
countries_list = sorted(countries_list)
# Fill dataframe
for s in sports_names:
    if thtmls[s]:
        for i,c in enumerate(countries_list):
            ci = all_countries_format_list[i]
            for m in medal_names:
                if c in sports_countries[s]:
                    rowtemp = [ci, s, m, medals[s][c][m]]
                else:
                    rowtemp = [ci, s, m, 0]
                dftemp = pd.DataFrame([rowtemp], columns=columns)
                df =df.append(dftemp)

Save dataframe


In [ ]:
df.to_csv('now_medals.csv')